{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rules import normalized_chars\n",
    "import random\n",
    "import re\n",
    "from unidecode import unidecode\n",
    "\n",
    "laughing = {\n",
    "    'huhu',\n",
    "    'haha',\n",
    "    'gagaga',\n",
    "    'hihi',\n",
    "    'wkawka',\n",
    "    'wkwk',\n",
    "    'kiki',\n",
    "    'keke',\n",
    "    'huehue',\n",
    "    'hshs',\n",
    "    'hoho',\n",
    "    'hewhew',\n",
    "    'uwu',\n",
    "    'sksk',\n",
    "    'ksks',\n",
    "    'gituu',\n",
    "    'gitu',\n",
    "    'mmeeooww',\n",
    "    'meow',\n",
    "    'alhamdulillah',\n",
    "    'muah',\n",
    "    'mmuahh',\n",
    "    'hehe',\n",
    "    'salamramadhan',\n",
    "    'happywomensday',\n",
    "    'jahagaha',\n",
    "    'ahakss',\n",
    "    'ahksk'\n",
    "}\n",
    "\n",
    "def make_cleaning(s, c_dict):\n",
    "    s = s.translate(c_dict)\n",
    "    return s\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"\n",
    "    use by any transformer model before tokenization\n",
    "    \"\"\"\n",
    "    string = unidecode(string)\n",
    "    \n",
    "    string = ' '.join(\n",
    "        [make_cleaning(w, normalized_chars) for w in string.split()]\n",
    "    )\n",
    "    string = re.sub('\\(dot\\)', '.', string)\n",
    "    string = (\n",
    "        re.sub(re.findall(r'\\<a(.*?)\\>', string)[0], '', string)\n",
    "        if (len(re.findall(r'\\<a (.*?)\\>', string)) > 0)\n",
    "        and ('href' in re.findall(r'\\<a (.*?)\\>', string)[0])\n",
    "        else string\n",
    "    )\n",
    "    string = re.sub(\n",
    "        r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n",
    "    )\n",
    "    \n",
    "    chars = '.,/'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(r'[ ]+', ' ', string).strip().split()\n",
    "    string = [w for w in string if w[0] != '@']\n",
    "    x = []\n",
    "    for word in string:\n",
    "        word = word.lower()\n",
    "        if any([laugh in word for laugh in laughing]):\n",
    "            if random.random() >= 0.5:\n",
    "                x.append(word)\n",
    "        else:\n",
    "            x.append(word)\n",
    "    string = [w.title() if w[0].isupper() else w for w in x]\n",
    "    return ' '.join(string)"
   ]
  },
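  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A quick sanity check of `cleaning` on a made-up sentence (hypothetical\n",
    "# input, not from the dataset). The laughing-token drop is stochastic,\n",
    "# so 'hahaha' may or may not survive without the fixed seed.\n",
    "random.seed(1)\n",
    "cleaning('Jom tengok https://example.com hahaha @someone Comel sangat')"
   ]
  },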
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['severe toxic',\n",
       " 'obscene',\n",
       " 'identity attack',\n",
       " 'insult',\n",
       " 'threat',\n",
       " 'asian',\n",
       " 'atheist',\n",
       " 'bisexual',\n",
       " 'black',\n",
       " 'buddhist',\n",
       " 'christian',\n",
       " 'female',\n",
       " 'heterosexual',\n",
       " 'indian',\n",
       " 'homosexual, gay or lesbian',\n",
       " 'intellectual or learning disability',\n",
       " 'jewish',\n",
       " 'latino',\n",
       " 'male',\n",
       " 'muslim',\n",
       " 'other disability',\n",
       " 'other gender',\n",
       " 'other race or ethnicity',\n",
       " 'other religion',\n",
       " 'other sexual orientation',\n",
       " 'physical disability',\n",
       " 'psychiatric or mental illness',\n",
       " 'transgender',\n",
       " 'white',\n",
       " 'malay',\n",
       " 'chinese']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels = \"\"\"\n",
    "1. severe toxic\n",
    "2. obscene\n",
    "3. identity attack\n",
    "4. insult\n",
    "5. threat\n",
    "6. asian\n",
    "7. atheist\n",
    "8. bisexual\n",
    "9. black\n",
    "10. buddhist\n",
    "11. christian\n",
    "12. female\n",
    "13. heterosexual\n",
    "14. indian\n",
    "15. homosexual, gay or lesbian\n",
    "16. intellectual or learning disability\n",
    "17. jewish\n",
    "18. latino\n",
    "19. male\n",
    "20. muslim\n",
    "21. other disability\n",
    "22. other gender\n",
    "23. other race or ethnicity\n",
    "24. other religion\n",
    "25. other sexual orientation\n",
    "26. physical disability\n",
    "27. psychiatric or mental illness\n",
    "28. transgender\n",
    "29. white\n",
    "30. malay\n",
    "31. chinese\n",
    "\"\"\"\n",
    "labels = [l.split('. ')[1].strip() for l in labels.split('\\n') if len(l)]\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['../toxicity/translated-1750000.json',\n",
       " '../toxicity/translated-1450000.json',\n",
       " '../toxicity/translated-700000.json',\n",
       " '../toxicity/translated-350000.json',\n",
       " '../toxicity/translated-600000.json',\n",
       " '../toxicity/translated-900000.json',\n",
       " '../toxicity/translated-1000000.json',\n",
       " '../toxicity/translated-1100000.json',\n",
       " '../toxicity/translated-550000.json',\n",
       " '../toxicity/translated-150000.json',\n",
       " '../toxicity/translated-500000.json',\n",
       " '../toxicity/translated-1500000.json',\n",
       " '../toxicity/translated-1150000.json',\n",
       " '../toxicity/translated-750000.json',\n",
       " '../toxicity/translated-850000.json',\n",
       " '../toxicity/translated-1650000.json',\n",
       " '../toxicity/translated-300000.json',\n",
       " '../toxicity/translated-650000.json',\n",
       " '../toxicity/translated-950000.json',\n",
       " '../toxicity/translated-250000.json',\n",
       " '../toxicity/translated-1600000.json',\n",
       " '../toxicity/translated-0.json',\n",
       " '../toxicity/translated-1550000.json',\n",
       " '../toxicity/translated-1800000.json',\n",
       " '../toxicity/translated-450000.json',\n",
       " '../toxicity/translated-50000.json',\n",
       " '../toxicity/translated-1050000.json',\n",
       " '../toxicity/translated-1200000.json',\n",
       " '../toxicity/translated-1700000.json',\n",
       " '../toxicity/translated-400000.json']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import glob\n",
    "\n",
    "files = glob.glob('../toxicity/translated*')\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "../toxicity/translated-1750000.json\n",
      "../toxicity/translated-1450000.json\n",
      "../toxicity/translated-700000.json\n",
      "../toxicity/translated-350000.json\n",
      "../toxicity/translated-600000.json\n",
      "../toxicity/translated-900000.json\n",
      "../toxicity/translated-1000000.json\n",
      "../toxicity/translated-1100000.json\n",
      "../toxicity/translated-550000.json\n",
      "../toxicity/translated-150000.json\n",
      "../toxicity/translated-500000.json\n",
      "../toxicity/translated-1500000.json\n",
      "../toxicity/translated-1150000.json\n",
      "../toxicity/translated-750000.json\n",
      "../toxicity/translated-850000.json\n",
      "../toxicity/translated-1650000.json\n",
      "../toxicity/translated-300000.json\n",
      "../toxicity/translated-650000.json\n",
      "../toxicity/translated-950000.json\n",
      "../toxicity/translated-250000.json\n",
      "../toxicity/translated-1600000.json\n",
      "../toxicity/translated-0.json\n",
      "../toxicity/translated-1550000.json\n",
      "../toxicity/translated-1800000.json\n",
      "../toxicity/translated-450000.json\n",
      "../toxicity/translated-50000.json\n",
      "../toxicity/translated-1050000.json\n",
      "../toxicity/translated-1200000.json\n",
      "../toxicity/translated-1700000.json\n",
      "../toxicity/translated-400000.json\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1401054"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "X, Y = [], []\n",
    "\n",
    "for file in files:\n",
    "    print(file)\n",
    "    with open(file) as fopen:\n",
    "        f = json.load(fopen)\n",
    "        for row in f:\n",
    "            if len(row[1]) == 29:\n",
    "                X.append(row[0])\n",
    "                Y.append(row[1] + [0, 0])\n",
    "        \n",
    "    \n",
    "len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "rejected_labels = ['black', 'white', 'jewish', 'latino']\n",
    "[labels.index(l) for l in rejected_labels]\n",
    "labels = [l for l in labels if l not in rejected_labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ydf = pd.DataFrame(np.array(Y))\n",
    "ydf = ydf.loc[(ydf[8] == 0) & (ydf[28] == 0) & (ydf[16] == 0) & (ydf[17] == 0)]\n",
    "ydf = ydf.drop([8, 28, 16, 17], axis = 1)\n",
    "ix = ydf.index.tolist()\n",
    "Y = ydf.values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = [X[i] for i in ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {'severe_toxic': 'severe toxic', 'identity_hate': 'identity attack',\n",
    "          'toxic': 'severe toxic', 'melayu': 'malay', 'cina': 'chinese', 'india': 'indian'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_onehot(tags, depth = len(labels)):\n",
    "    onehot = [0] * depth\n",
    "    for tag in tags:\n",
    "        onehot[labels.index(tag)] = 1\n",
    "    return onehot"
   ]
  },
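  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick check of `generate_onehot` with a hypothetical tag list:\n",
    "# only the positions of the given labels should flip to 1.\n",
    "v = generate_onehot(['malay', 'insult'])\n",
    "[labels[i] for i, flag in enumerate(v) if flag]"
   ]
  },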
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "melayu 84851\n",
      "cina 43956\n",
      "india 20208\n"
     ]
    }
   ],
   "source": [
    "with open('../toxicity/kaum.json') as fopen:\n",
    "    kaum = json.load(fopen)\n",
    "    \n",
    "for k, v in kaum.items():\n",
    "    print(k, len(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../toxicity/weak-learning-toxicity.json') as fopen:\n",
    "    scores = json.load(fopen)\n",
    "    \n",
    "for k, v in scores.items():\n",
    "    for no in range(len(v)):\n",
    "        tags = []\n",
    "        for l, v_ in v[no].items():\n",
    "            if round(v_) == 1:\n",
    "                tags.append(mapping.get(l, l))\n",
    "        tags.append(mapping[k])\n",
    "        Y.append(generate_onehot(tags))\n",
    "        X.append(kaum[k][no])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1510055/1510055 [04:35<00:00, 5488.19it/s] \n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    X[i] = cleaning(X[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1510055/1510055 [00:01<00:00, 1251440.85it/s]\n"
     ]
    }
   ],
   "source": [
    "actual_t, actual_l = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    if len(X[i]) > 2:\n",
    "        actual_t.append(X[i])\n",
    "        actual_l.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('combined.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(actual_t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import youtokentome as yttm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 26.8 s, sys: 5.77 s, total: 32.5 s\n",
      "Wall time: 7.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "bpe = yttm.BPE.train(data='combined.txt', \n",
    "               vocab_size=60000, model='toxic.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "60000"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = {v: i for i, v in enumerate(bpe.vocab())}\n",
    "rev_vocab = {i: v for i, v in enumerate(bpe.vocab())}\n",
    "len(vocab)"
   ]
  },
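  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A small sketch of what the trained BPE produces on a hypothetical\n",
    "# sentence: subword pieces (with the '▁' word-boundary marker) and ids.\n",
    "s = 'bodoh betul kerajaan ni'\n",
    "bpe.encode([s], output_type=yttm.OutputType.SUBWORD)[0], bpe.encode([s], output_type=yttm.OutputType.ID)[0]"
   ]
  },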
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "r = re.compile(r'[\\S]+').findall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "subs = [' '.join(s) for s in bpe.encode(actual_t, output_type=yttm.OutputType.SUBWORD)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf = TfidfVectorizer(vocabulary = vocab, token_pattern = r'[\\S]+').fit(subs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('tfidf-toxic.pkl','wb') as fopen:\n",
    "    pickle.dump(tfidf,fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "vector = tfidf.transform(subs)"
   ]
  },
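  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Featurizing one new, hypothetical sentence end-to-end:\n",
    "# clean -> BPE subwords -> TF-IDF sparse vector of width 60000.\n",
    "sample = cleaning('bodoh betul la kerajaan ni')\n",
    "sample_sub = ' '.join(bpe.encode([sample], output_type=yttm.OutputType.SUBWORD)[0])\n",
    "tfidf.transform([sample_sub]).shape"
   ]
  },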
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = np.around(np.array(actual_l))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1201860, 60000), (300465, 60000))"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(vector, Y, test_size = 0.2)\n",
    "train_X.shape, test_X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import ComplementNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.multiclass import OneVsRestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/multiclass.py:75: UserWarning: Label not 17 is present in all training examples.\n",
      "  str(classes[c]))\n",
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/multiclass.py:75: UserWarning: Label not 18 is present in all training examples.\n",
      "  str(classes[c]))\n"
     ]
    }
   ],
   "source": [
    "multinomial = OneVsRestClassifier(ComplementNB()).fit(train_X, train_Y)"
   ]
  },
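  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: score the featurized sample from the TF-IDF sanity check above\n",
    "# and list every label whose probability clears 0.5.\n",
    "probs = multinomial.predict_proba(tfidf.transform([sample_sub]))[0]\n",
    "[(labels[i], p) for i, p in enumerate(probs) if p >= 0.5]"
   ]
  },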
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                     precision    recall  f1-score   support\n",
      "\n",
      "                       severe toxic    0.31967   0.99343   0.48370     39401\n",
      "                            obscene    0.06234   0.68947   0.11434     11287\n",
      "                    identity attack    0.03492   0.61682   0.06610      5624\n",
      "                             insult    0.17200   0.73748   0.27894     50732\n",
      "                             threat    0.00799   0.12224   0.01500      1718\n",
      "                              asian    0.00180   0.03686   0.00344      1492\n",
      "                            atheist    0.00176   0.05402   0.00341       722\n",
      "                           bisexual    0.00007   0.01136   0.00014        88\n",
      "                           buddhist    0.00035   0.02885   0.00070       208\n",
      "                          christian    0.14597   0.88664   0.25067     18754\n",
      "                             female    0.13707   0.80804   0.23438     27850\n",
      "                       heterosexual    0.00131   0.04887   0.00255       532\n",
      "                             indian    0.15246   0.97626   0.26373     16425\n",
      "         homosexual, gay or lesbian    0.05431   0.52286   0.09840      5577\n",
      "intellectual or learning disability    0.00000   0.00000   0.00000        26\n",
      "                               male    0.08945   0.61875   0.15630     19953\n",
      "                             muslim    0.08552   0.63404   0.15071     10247\n",
      "                   other disability    0.00000   0.00000   0.00000         0\n",
      "                       other gender    0.00000   0.00000   0.00000         0\n",
      "            other race or ethnicity    0.00007   0.03226   0.00015        31\n",
      "                     other religion    0.00000   0.00000   0.00000        34\n",
      "           other sexual orientation    0.00000   0.00000   0.00000         2\n",
      "                physical disability    0.00000   0.00000   0.00000        19\n",
      "      psychiatric or mental illness    0.01110   0.14322   0.02060      2353\n",
      "                        transgender    0.00238   0.06331   0.00458       853\n",
      "                              malay    0.55148   0.99407   0.70941     67807\n",
      "                            chinese    0.29848   0.99167   0.45885     35163\n",
      "\n",
      "                          micro avg    0.15726   0.84487   0.26516    316898\n",
      "                          macro avg    0.07891   0.37076   0.12282    316898\n",
      "                       weighted avg    0.25933   0.84487   0.37857    316898\n",
      "                        samples avg    0.08185   0.16444   0.09732    316898\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n",
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    metrics.classification_report(\n",
    "        train_Y,\n",
    "        multinomial.predict(train_X),\n",
    "        target_names=labels,digits=5\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                     precision    recall  f1-score   support\n",
      "\n",
      "                       severe toxic    0.32096   0.99468   0.48532      9955\n",
      "                            obscene    0.06031   0.68096   0.11081      2799\n",
      "                    identity attack    0.03312   0.60086   0.06277      1393\n",
      "                             insult    0.15655   0.69002   0.25519     12575\n",
      "                             threat    0.00661   0.11058   0.01247       416\n",
      "                              asian    0.00087   0.01799   0.00166       389\n",
      "                            atheist    0.00137   0.04494   0.00266       178\n",
      "                           bisexual    0.00052   0.08333   0.00104        24\n",
      "                           buddhist    0.00000   0.00000   0.00000        45\n",
      "                          christian    0.13652   0.86153   0.23570      4622\n",
      "                             female    0.12714   0.78073   0.21867      6891\n",
      "                       heterosexual    0.00153   0.06299   0.00299       127\n",
      "                             indian    0.14732   0.97509   0.25597      4014\n",
      "         homosexual, gay or lesbian    0.04442   0.45581   0.08095      1369\n",
      "intellectual or learning disability    0.00000   0.00000   0.00000         6\n",
      "                               male    0.08106   0.58298   0.14233      4947\n",
      "                             muslim    0.07845   0.59531   0.13863      2602\n",
      "                   other disability    0.00000   0.00000   0.00000         0\n",
      "                       other gender    0.00000   0.00000   0.00000         2\n",
      "            other race or ethnicity    0.00000   0.00000   0.00000         7\n",
      "                     other religion    0.00000   0.00000   0.00000         8\n",
      "           other sexual orientation    0.00000   0.00000   0.00000         1\n",
      "                physical disability    0.00000   0.00000   0.00000         2\n",
      "      psychiatric or mental illness    0.00720   0.09651   0.01340       601\n",
      "                        transgender    0.00249   0.06608   0.00481       227\n",
      "                              malay    0.54919   0.99337   0.70733     17044\n",
      "                            chinese    0.29545   0.99079   0.45517      8793\n",
      "\n",
      "                          micro avg    0.14989   0.82799   0.25383     79037\n",
      "                          macro avg    0.07597   0.35869   0.11807     79037\n",
      "                       weighted avg    0.25444   0.82799   0.37086     79037\n",
      "                        samples avg    0.07772   0.16003   0.09295     79037\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    metrics.classification_report(\n",
    "        test_Y,\n",
    "        multinomial.predict(test_X),\n",
    "        target_names=labels,digits=5\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('multinomial-toxic.pkl','wb') as fopen:\n",
    "    pickle.dump(multinomial,fopen)"
   ]
  },
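  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How downstream inference could look, assuming only the three saved\n",
    "# artifacts are available; a sketch, not part of the original pipeline.\n",
    "loaded_bpe = yttm.BPE(model='toxic.model')\n",
    "with open('tfidf-toxic.pkl', 'rb') as fopen:\n",
    "    loaded_tfidf = pickle.load(fopen)\n",
    "with open('multinomial-toxic.pkl', 'rb') as fopen:\n",
    "    loaded_model = pickle.load(fopen)\n",
    "\n",
    "def predict_toxicity(text, threshold = 0.5):\n",
    "    # same featurization as training: clean -> subwords -> tf-idf\n",
    "    sub = ' '.join(loaded_bpe.encode([cleaning(text)], output_type=yttm.OutputType.SUBWORD)[0])\n",
    "    probs = loaded_model.predict_proba(loaded_tfidf.transform([sub]))[0]\n",
    "    return [(labels[i], p) for i, p in enumerate(probs) if p >= threshold]\n",
    "\n",
    "predict_toxicity('bodoh betul la semua ni')"
   ]
  },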
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "s3 = boto3.client('s3')\n",
    "bucketName = 'huseinhouse-storage'\n",
    "\n",
    "Key = 'multinomial-toxic.pkl'\n",
    "outPutname = \"v34/toxicity/multinomial.pkl\"\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "Key = 'tfidf-toxic.pkl'\n",
    "outPutname = \"v34/toxicity/tfidf.pkl\"\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "Key = 'toxic.model'\n",
    "outPutname = \"v34/toxicity/youtokentome.model\"\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
